miceEmile Latour
February 17, 2021
Here we load the packages that we will be using during the analysis. See the section titled “Links and references” for more info on the packages
library(tidyverse) # packages ggplot2, dplyr, tidyr, readr, purrr, tibble,
# stringr, and forcats
library(broom) # functions tidy(), glance(), augment()
library(janitor) # for working with dirty data
library(here) # simpler way to find your files
library(skimr) # Compact and Flexible Summaries of Data
library(DT) # Helpful for looking at large data frames
library(mice) # Multiple imputation using Fully Conditional Specification
library(naniar) # structures, summaries, and visualizations for missing data
library(VIM) # Visualization and Imputation of Missing Values
library(finalfit) # Check for associations between missing and observed data.A simulated data set of Medicaid enrollment data that has been matched with Electronic Health Records (EHR) for the purposes of comparing the Medicaid claims data with the EHR to ensure accuracy/agreement.
The data contains:
To provide a practical real-world example, the data used here was based on an actual data set, but due to complications with using patient data and privacy, all data here has been simulated and similarities to the original data are unintentional.
Using the here package to construct the path to the data set. Specifying the column types ahead of time.
# Read the simulated EHR + Medicaid claims data. Column types are declared
# up front so readr does not have to guess: pat_id is kept as character
# (ids such as "05094" carry leading zeros) and the 0/1 screening
# indicators are read as doubles, to be recoded to factors later.
data <- readr::read_csv(
file = here::here("data",
"ehr-and-claims_simulated-data_csp-2021.csv"),
col_types = cols(pat_id = col_character(),
age = col_double(),
sex = col_character(),
race_eth = col_character(),
language = col_character(),
fpl = col_character(),
age_f = col_character(),
breast_eligibility = col_double(),
flu_eligibility = col_double(),
cholesterol_eligibility = col_double(),
breast_claims = col_double(),
flu_claims = col_double(),
cholesterol_claims = col_double(),
breast_ehr = col_double(),
flu_ehr = col_double(),
cholesterol_ehr = col_double(),
cholesterol_total = col_double()))Take note of
## Rows: 13,101
## Columns: 17
## $ pat_id <chr> "05094", "05711", "01013", "11608", "12443", …
## $ age <dbl> NA, 48, 26, 59, 61, 34, 54, 59, 50, 40, NA, 6…
## $ sex <chr> "Female", "Female", "Female", "Male", "Female…
## $ race_eth <chr> "Non-Hispanic, White", "Non-Hispanic, White",…
## $ language <chr> "English", "English", "English", "English", "…
## $ fpl <chr> "<=138% FPL", "<=138% FPL", "<=138% FPL", "<=…
## $ age_f <chr> NA, "35-<50", "19-<34", "51-<64", "51-<64", "…
## $ breast_eligibility <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ flu_eligibility <dbl> 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, …
## $ cholesterol_eligibility <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ breast_claims <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ flu_claims <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cholesterol_claims <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ breast_ehr <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ flu_ehr <dbl> 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cholesterol_ehr <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, …
## $ cholesterol_total <dbl> 152.9439, NA, NA, 193.5783, NA, 226.7804, 244…
Convert coded variables to factor type.
# Convert coded variables to factors with explicit level orderings so that
# downstream models and tables use consistent reference levels.
# The three families of 0/1 indicator columns (*_eligibility, *_claims,
# *_ehr) share the same recoding rule, so one across() handles all of them:
# tidyselect's ends_with() accepts a vector of suffixes and matches any.
data <- data %>%
  mutate(sex = factor(sex,
                      levels = c("Female", "Male")),
         race_eth = factor(race_eth,
                           levels = c("Hispanic",
                                      "Non-Hispanic, Black",
                                      "Non-Hispanic, Other",
                                      "Non-Hispanic, White")),
         language = factor(language,
                           levels = c("English",
                                      "Spanish",
                                      "Other")),
         fpl = factor(fpl,
                      levels = c("<=138% FPL",
                                 ">138% FPL")),
         age_f = factor(age_f,
                        levels = c("19-<34",
                                   "35-<50",
                                   "51-<64")),
         # 1/0 -> "Yes"/"No" for every screening indicator column at once
         dplyr::across(.cols = dplyr::ends_with(c("_eligibility",
                                                  "_claims",
                                                  "_ehr")),
                       .fns = ~ factor(.,
                                       levels = c(1, 0),
                                       labels = c("Yes", "No")))
  )
# Good step I always do to ensure consistent, clean naming conventions
# Standardize column names to snake_case; the names already look clean in
# the glimpse above, so this mainly guards against changes to the raw file.
data <- data %>%
janitor::clean_names()Take another glimpse at the data and note how the values have changed.
## Rows: 13,101
## Columns: 17
## $ pat_id <chr> "05094", "05711", "01013", "11608", "12443", …
## $ age <dbl> NA, 48, 26, 59, 61, 34, 54, 59, 50, 40, NA, 6…
## $ sex <fct> Female, Female, Female, Male, Female, Male, M…
## $ race_eth <fct> "Non-Hispanic, White", "Non-Hispanic, White",…
## $ language <fct> English, English, English, English, Spanish, …
## $ fpl <fct> <=138% FPL, <=138% FPL, <=138% FPL, <=138% FP…
## $ age_f <fct> NA, 35-<50, 19-<34, 51-<64, 51-<64, 19-<34, 5…
## $ breast_eligibility <fct> Yes, Yes, No, No, Yes, No, No, Yes, No, No, N…
## $ flu_eligibility <fct> Yes, No, No, Yes, Yes, No, Yes, Yes, No, No, …
## $ cholesterol_eligibility <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, …
## $ breast_claims <fct> Yes, No, No, No, No, No, No, Yes, No, No, No,…
## $ flu_claims <fct> No, No, No, Yes, No, No, Yes, No, No, No, No,…
## $ cholesterol_claims <fct> Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, …
## $ breast_ehr <fct> Yes, No, No, No, No, No, No, No, No, No, No, …
## $ flu_ehr <fct> No, No, Yes, Yes, No, Yes, Yes, No, No, No, N…
## $ cholesterol_ehr <fct> Yes, No, No, Yes, No, Yes, Yes, No, No, No, Y…
## $ cholesterol_total <dbl> 152.9439, NA, NA, 193.5783, NA, 226.7804, 244…
skimr::skim() is a nice alternative to the base R function summary(). Though it doesn’t always play nicely depending on your operating system…
# With spark graphs if system allows
# skimr::skim(data)
# Without spark graphs
skimr::skim_without_charts(data)| Name | data |
| Number of rows | 13101 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 14 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| pat_id | 0 | 1 | 5 | 5 | 0 | 13101 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sex | 1313 | 0.90 | FALSE | 2 | Fem: 7759, Mal: 4029 |
| race_eth | 551 | 0.96 | FALSE | 4 | Non: 8897, Non: 1406, His: 1226, Non: 1021 |
| language | 0 | 1.00 | FALSE | 3 | Eng: 10948, Oth: 1523, Spa: 630 |
| fpl | 2813 | 0.79 | FALSE | 2 | <=1: 10062, >13: 226 |
| age_f | 1947 | 0.85 | FALSE | 3 | 19-: 4306, 35-: 4095, 51-: 2753 |
| breast_eligibility | 0 | 1.00 | FALSE | 2 | No: 8845, Yes: 4256 |
| flu_eligibility | 0 | 1.00 | FALSE | 2 | No: 9424, Yes: 3677 |
| cholesterol_eligibility | 0 | 1.00 | FALSE | 2 | Yes: 12797, No: 304 |
| breast_claims | 0 | 1.00 | FALSE | 2 | No: 11467, Yes: 1634 |
| flu_claims | 0 | 1.00 | FALSE | 2 | No: 11856, Yes: 1245 |
| cholesterol_claims | 0 | 1.00 | FALSE | 2 | No: 7775, Yes: 5326 |
| breast_ehr | 0 | 1.00 | FALSE | 2 | No: 11550, Yes: 1551 |
| flu_ehr | 0 | 1.00 | FALSE | 2 | No: 8789, Yes: 4312 |
| cholesterol_ehr | 0 | 1.00 | FALSE | 2 | No: 8042, Yes: 5059 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| age | 1947 | 0.85 | 39.68 | 12.69 | 19.00 | 29.00 | 39.00 | 49.0 | 64.00 |
| cholesterol_total | 8042 | 0.39 | 199.63 | 42.90 | 47.23 | 170.67 | 199.48 | 227.7 | 353.94 |
One of the most important and involved steps in performing multiple imputation. See slide deck on performing EDA of data with missingness.
mice packageThe aims of this session are to
The aims are to introduce the mice package, explain the mice algorithm, and demonstrate mice functionality. Set the seed to ensure that all results, figures, etc. are reproducible.
Tune the imputation using the default settings to begin with.
The mice algorithm tends to converge rather quickly. Advice is to set the iterations high enough that you see convergence but not so high that it will drastically increase the computation time.
Once the imputation model is specified and convergence issues are addressed, increase the number of imputations for the final model. Allison suggests: the number of imputations should be similar to the percentage of cases that are incomplete. This can also make for long computation time, which is an appropriate price to pay for imputation with a large amount of missingness.
In our data set the percentage of cases that are incomplete was:
## # A tibble: 1 x 3
## df var case
## <dbl> <dbl> <dbl>
## 1 0.0746 0.353 0.785
Stick with the defaults for now.
Start with an “initial” empty imputation to see what the software does by default. Also, this provides some objects to help in setting up the final imputation model.
See what is contained in the init object that contains the imputations and data about the setup.
## Warning: Number of logged events: 1
## [1] "data" "imp" "m" "where"
## [5] "blocks" "call" "nmis" "method"
## [9] "predictorMatrix" "visitSequence" "formulas" "post"
## [13] "blots" "ignore" "seed" "iteration"
## [17] "lastSeedValue" "chainMean" "chainVar" "loggedEvents"
## [21] "version" "date"
We already see a warning which is due to potential collinearity between variables in our data set: age and age_f.
We are most interested in the methods, predictorMatrix, and the visitSequence which we will work with and adjust to help set up the final imputation models.
methodsThese are the default model specifications that the software chose based on data type.
## age sex race_eth fpl
## "pmm" "logreg" "polyreg" "logreg"
## age_f cholesterol_total
## "polyreg" "pmm"
We can see the defaults that the package chooses for the variables with missing data:
For age, predictive mean matching (pmm), a numeric variable type. For sex, logistic regression (logreg), a factor with 2 levels. For race_eth, polytomous regression (polyreg), a factor with > 2 levels. For fpl, logistic regression (logreg), a factor with 2 levels. For age_f, polytomous regression (polyreg), a factor with > 2 levels. See link to Van Buuren’s text online for all available methods.
According to Van Buuren: pmm is robust to transformation of target variables and it can handle discrete variables. pmm will not impute outside of the observed data range.
Predictive mean matching calculates the predicted value of target variable Y according to the specified imputation model. For each missing entry, the method forms a small set of candidate donors (typically with 3, 5 or 10 members) from all complete cases that have predicted values closest to the predicted value for the missing entry. One donor is randomly drawn from the candidates, and the observed value of the donor is taken to replace the missing value. The assumption is the distribution of the missing cell is the same as the observed data of the candidate donors. (Van Buuren, Flexible Imputation of Missing Data)
predictorMatrix1s and 0s indicate which predictors were selected by the softwarevisitSequenceWe can also check the visit sequence to see the order that variables are “visited” in the imputation process. Default is the order of the variables in the data frame.
## [1] "pat_id" "age"
## [3] "sex" "race_eth"
## [5] "language" "fpl"
## [7] "age_f" "breast_eligibility"
## [9] "flu_eligibility" "cholesterol_eligibility"
## [11] "breast_claims" "flu_claims"
## [13] "cholesterol_claims" "breast_ehr"
## [15] "flu_ehr" "cholesterol_ehr"
## [17] "cholesterol_total"
In Van Buuren’s vignette and book, he lays out steps to follow when setting up multiple imputation.
Here we will go through the process with our example data to
Assuming MAR is typically a reasonable place to start. There is literature on sensitivity analysis with the imputations to examine if this assumption is met. And there are techniques to model the missing mechanism with the imputation if there is violation. This work is outside the scope of what I hope to share here.
Exploring the missingness is so important for this first step!
We want to decide the form of the model used to impute the missing values of each variable. We saved the default choices from before.
## age sex race_eth fpl
## "pmm" "logreg" "polyreg" "logreg"
## age_f cholesterol_total
## "polyreg" "pmm"
These look pretty reasonable given what we know of our data set. We could change the settings to the meth object if we liked:
## age sex race_eth fpl
## "norm.nob" "logreg" "polyreg" "logreg"
## age_f cholesterol_total
## "polyreg" "pmm"
Change it back before proceeding since we want to use predictive mean matching for continuous data.
## age sex race_eth fpl
## "pmm" "logreg" "polyreg" "logreg"
## age_f cholesterol_total
## "polyreg" "pmm"
What variables to include in the multiple imputation model?
The advice is to include as many relevant variables as possible. One should include all variables that are in your scientific model of interest that will be used after imputation. Also variables that are related to the missingness of the variables you are imputing (i.e. those known to be related to nonresponse). Van Buuren has more advice on this and worth reading, vignette and book
Including as many predictors as possible makes the MAR assumption more reasonable. But with larger data sets, this is not advisable for computation purposes. Van Buuren suggests that 15 to 25 variables will work well. He also offers advice to cull that list.
Discuss with collaborators and lead researchers with domain knowledge of your data
I came across these from some class notes online:
The imputation model should include variables that are:
- crucial to the analysis
- highly predictive of variables crucial to the analysis
- highly predictive of missingness
- describe special features of the sample design
The model should be general enough to preserve any associations among variables (two-, three-, or higher- way associations) that may be of interest in later analyses.
In our case, we are on our own. To aid in these decisions the mice package has a function that produces a “quick predictor matrix” that is useful for dealing with data sets with large number of variables. The software chooses by calculating two correlations with the available cases, taking the larger, and seeing if it meets a minimum threshold. Type ?mice::quickpred in the R console for better description.
Below I run the quickpred() to see what the software chooses. I just show the rows and columns
In my work, I’ve used a combination of the predictor matrix and collaborators input to select predictors for the imputation model.
Note that the models can have different sets of predictors.
From the help details section for mice::quickpred:
The function is designed to aid in setting up a good imputation model for data with many variables.
Basic workings: The procedure calculates for each variable pair (i.e. target-predictor pair) two correlations using all available cases per pair. The first correlation uses the values of the target and the predictor directly. The second correlation uses the (binary) response indicator of the target and the values of the predictor. If the largest (in absolute value) of these correlations exceeds mincor, the predictor will be added to the imputation set. The default value for mincor is 0.1.
In addition, the procedure eliminates predictors whose proportion of usable cases fails to meet the minimum specified by minpuc. The default value is 0, so predictors are retained even if they have no usable case.
Finally, the procedure includes any predictors named in the include argument (which is useful for background variables like age and sex) and eliminates any predictor named in the exclude argument. If a variable is listed in both include and exclude arguments, the include argument takes precedence.
Ensure that patient ID (pat_id) doesn’t get included as a predictor in any of the models.
In the slides on EDA we saw that:
race_eth look like good predictors for age (96.6%).race_eth look like good predictors for sex (96.7%).age and sex look like good predictors for race_eth (88.0% and 92.2%).age and sex and race_eth look like good predictors for fpl (89.5% and 93.1% and 95.8%).race_eth, and sex would be potentially helpful.Transformations, sum scores, etc. were not used in this data set so not much to consider here. In some cases, there can be a lot to think about particularly if a variable is transformed solely to meet the normality assumption in a regression model. So do a literature review if this issue applies to you.
Need to handle the derived variable age_f which is calculated from age. See chapter in Van Buuren text for a few different methods. Here I will use “passive imputation” to impute age_f as a derived variable.
# User defined function to create age_f, a categorical version of the continuous
# age variable
# Derive the categorical age band (age_f) from continuous age.
# Intervals are left-closed: [19, 35), [35, 50), [50, 65); values outside
# [19, 65) come back as NA, as do NA inputs.
# NOTE(review): labels are kept verbatim to match the existing age_f factor
# levels in the data, even though "19-<34" does not literally describe
# the interval [19, 35).
calc_age_f <- function(x) {
  age_breaks <- c(19, 35, 50, 65)
  age_labels <- c("19-<34", "35-<50", "51-<64")
  cut(x, breaks = age_breaks, labels = age_labels, right = FALSE)
}
# Passive imputation: derive age_f from the (possibly imputed) age value on
# every iteration, rather than imputing it as an independent variable.
meth["age_f"] <- "~I(calc_age_f(age))"
meth[meth != ""]## age sex race_eth
## "pmm" "logreg" "polyreg"
## fpl age_f cholesterol_total
## "logreg" "~I(calc_age_f(age))" "pmm"
Since it’s based on other variables in the data, there will be collinearity in the imputation model when age_f is included. The software should detect this automatically, give a warning, and drop the variable automatically. Best to ensure that it is not included in the imputation model anyways.
I also don’t want to impute cholesterol_total or include it in any of my imputation models.
The default in the software goes by appearance in the data set left to right. It can be overwritten per the user’s direction. This becomes more of an issue if there is a longitudinal nature to the data set where missingness at an earlier time point would affect the missingness later. So impute early to later.
I will examine the imputation order by magnitude of missingness (low to high and high to low). To see if there is a difference in performance or convergence or any impact to the estimates. I usually default to imputing from highest percent missing to lowest.
## [1] "pat_id" "age"
## [3] "sex" "race_eth"
## [5] "language" "fpl"
## [7] "age_f" "breast_eligibility"
## [9] "flu_eligibility" "cholesterol_eligibility"
## [11] "breast_claims" "flu_claims"
## [13] "cholesterol_claims" "breast_ehr"
## [15] "flu_ehr" "cholesterol_ehr"
## [17] "cholesterol_total"
Override this by ordering based on magnitude of missing that we got from the EDA
## # A tibble: 6 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 cholesterol_total 8042 61.4
## 2 fpl 2813 21.5
## 3 age 1947 14.9
## 4 age_f 1947 14.9
## 5 sex 1313 10.0
## 6 race_eth 551 4.21
## [1] "fpl" "age" "age_f" "sex" "race_eth"
This is to ensure convergence. The default is 5. 10 to 20 are recommended.
## [1] 5
The rule of thumb from more recent authors is that the number of imputations should be similar to the percentage of cases (observations) that are incomplete (at least 5).
The software default is 5 and I typically use that when running for the first time and when checking convergence to save time.
We had set this previously above.
## [1] 5
All the setup work has been done and considerations made. Using the specifications that saved in objects above, we will run the mice command to impute the data sets.
# Run the imputation using the specification objects assembled above
# (method vector, predictor matrix, visit sequence, seed, iteration count).
imp <- mice::mice(data = data,
m = imp_num, # number of imputations, dflt = 5
method = meth, # specify the method
predictorMatrix = pred_guess,
visitSequence = visit_order,
seed = seed_for_imp,
maxit = iter_num, # number of iterations, dflt = 5
print = FALSE)## NULL
We plot the values of mean and standard deviation for each variable by number of iteration. We want to look for mixing of the lines and we do not want to see any trend.
I tend to think about issues with convergence similar to those that come up with any kind of modeling specifically combinations of categorical predictors that cause (almost perfect) data separation.
Hard to tell from just 5 iterations. But checking above there was nothing that immediately jumped out. Let’s run for more iterations to make sure and see when things start to settle down if at all.
# Continue the same chains for 35 more iterations (40 in total with those
# above) and plot chain means/SDs per variable to assess convergence.
imp40 <- mice::mice.mids(imp,
maxit = 35,
print = FALSE)
plot(imp40, c("fpl",
"age",
"age_f",
"sex",
"race_eth"))See Van Buuren chapter 6.6. The mice package contains a few functions to help graphically assess the differences between the observed and imputed data.
Discrepancies may be due to the imputation model, missingness assumption, or both.
Not a lot out there for categorical variables. It’s not ideal, but you can use the densityplot function on categorical variables to get an idea.
I’ve made plots on my own which gets a little hard to explain. The general idea is to
# Add a column to the data to indicate if there were missing values for sex
missing_sex <- data %>%
dplyr::select(pat_id, sex) %>%
naniar::add_any_miss(data = .,
sex,
missing = "imputed",
complete = "observed")
# Now have an indicator for missing sex ("imputed"/"observed").
# produces a data set where imputed data sets are stacked vertically.
imp_long <- mice::complete(imp, "long")
# Join the missing indicator to the long data
imp_long <- missing_sex %>%
dplyr::select(pat_id, any_miss_vars) %>%
dplyr::left_join(.,
imp_long,
by = "pat_id")
# Similar to the strip plot idea
# Observed values plotted in gold, imputed values in dark blue, jittered so
# the distribution per imputation number (.imp) can be compared by eye.
ggplot() +
geom_jitter(data = imp_long %>% dplyr::filter(any_miss_vars == "observed"),
aes(x = .imp,
y = sex),
width = 0.10,
height = 0.25,
colour = "goldenrod",
alpha = 0.4) +
geom_jitter(data = imp_long %>% dplyr::filter(any_miss_vars == "imputed"),
aes(x = .imp,
y = sex),
width = 0.10,
height = 0.25,
colour = "darkblue",
alpha = 0.7)# Plotted another way
# Bar charts of sex, faceted by imputation number and imputed/observed flag
ggplot(data = imp_long,
aes(x = sex)) +
geom_bar(stat = "count") +
facet_grid(.imp ~ any_miss_vars)Once you are settled on the imputation model and reviewed the process for convergence, you are ready to create the multiple imputations. Double check your inputs, increase the number of imputations to perform.
# Final production run: increase to m = 40 imputations (roughly the
# percentage of incomplete cases, per the rule of thumb above) and
# maxit = 20 iterations, per the convergence checks.
imp_final <- mice::mice(data = data,
m = 40, # number of imputations, dflt = 5
method = meth, # specify the method
predictorMatrix = pred_guess,
visitSequence = visit_order,
seed = seed_for_imp,
maxit = 20, # number of iterations, dflt = 5
print = FALSE)
# user system elapsed
# 450.477 61.924 515.250Save the results to avoid having to re-run the multiple imputations.
We will use final imputation data imp_final for the rest of the slides.
Rubin’s Rules for pooling were covered in Miguel’s slides.
miceHere we will look at:
miceHere we are interested in the agreement between the claims data and the EHR data.
We did not impute the claims and EHR, but we did impute demographics. So let’s compare stratified agreement with complete case data and with multiply imputed data.
Agreement along the diagonal: Both sources say “Yes” or “No”. Kappa is considered “chance-adjusted” agreement.
Chosen here because it was of interest with the original study, but, also, because we will not transform kappa before pooling: simpler example.
## breast_ehr
## breast_claims Yes No
## Yes 657 639
## No 569 5894
# Complete-case Cohen's kappa for agreement between breast screening in
# claims vs EHR, tidied down to the unweighted estimate and its CI limits.
complete_kappa_res <- tab %>%
psych::cohen.kappa() %>%
broom::tidy() %>%
janitor::clean_names() %>%
dplyr::filter(type == "unweighted") %>%
dplyr::select(kappa = estimate,
lower_ci = conf_low,
upper_ci = conf_high)
complete_kappa_res## # A tibble: 1 x 3
## kappa lower_ci upper_ci
## <dbl> <dbl> <dbl>
## 1 0.428 0.401 0.455
Need to get the estimate and standard error for the statistic of interest from each imputed data set.
I’ve found that it’s best to create a wrapper function and test it on the original data first
# Wrapper function to
# Filter the data set
# Calculate Kappa and the asymptotic SE
# NOTE(review): the filter is hard-coded to sex == "Female" — confirm this
# stratification is intended everywhere the wrapper is reused.
calc_kappa <- function(data, x, y) {
data_filtered <- data %>%
dplyr::filter(sex == "Female")
# 2x2 cross-tabulation of the two rating columns named by x and y
tab <- table(data_filtered[[x]], data_filtered[[y]])
k_res <- psych::cohen.kappa(x = tab)
tibble::tibble(n = k_res$n.obs,
kappa = k_res$kappa,
se = sqrt(k_res$var.kappa) # Confirmed correct with vcd package
)
}## # A tibble: 1 x 3
## n kappa se
## <int> <dbl> <dbl>
## 1 7759 0.428 0.0137
Obtain the estimate and standard error for each imputed data set
# Set up for pooling
# Q collects the per-imputation kappa estimates; U the within-imputation
# variances (squared standard errors), as required for Rubin's rules.
m <- imp_final$m
Q <- rep(NA, m)
U <- rep(NA, m)
for (i in 1:m) {
kappa_res <- calc_kappa(data = complete(imp_final, i),
x = "breast_claims",
y = "breast_ehr")
Q[i] <- kappa_res$kappa
U[i] <- (kappa_res$se) ^ 2 # (standard error of estimate)^2
}## [1] 0.4438936 0.4443171 0.4439941 0.4440877 0.4440158 0.4440877 0.4439726
## [8] 0.4439870 0.4440014 0.4440302 0.4440903 0.4441021 0.4437642 0.4442169
## [15] 0.4441164 0.4446541 0.4445511 0.4440014 0.4445356 0.4440302 0.4438936
## [22] 0.4442488 0.4448959 0.4444211 0.4440733 0.4442985 0.4441595 0.4437561
## [29] 0.4439438 0.4439582 0.4446087 0.4440589 0.4440014 0.4440514 0.4436344
## [36] 0.4440080 0.4443680 0.4444067 0.4444827 0.4440014
## [1] 0.0001517263 0.0001516603 0.0001516851 0.0001517544 0.0001517839
## [6] 0.0001517544 0.0001518016 0.0001517957 0.0001517898 0.0001517780
## [11] 0.0001518658 0.0001517485 0.0001517794 0.0001517014 0.0001517426
## [16] 0.0001517467 0.0001518922 0.0001517898 0.0001516829 0.0001517780
## [21] 0.0001517263 0.0001518007 0.0001517505 0.0001517300 0.0001517603
## [26] 0.0001516726 0.0001517250 0.0001518905 0.0001518135 0.0001518075
## [31] 0.0001518685 0.0001517662 0.0001517898 0.0001516616 0.0001518327
## [36] 0.0001518949 0.0001517470 0.0001517358 0.0001517000 0.0001517898
# A more "tidyverse-y" way
# complete("all") returns the m completed data sets as a list; map_dfr()
# applies the kappa wrapper to each one and row-binds the results, keeping
# the imputation number in column "m".
imp_final %>%
mice::complete("all") %>%
purrr::map_dfr(.x = .,
.f = ~ calc_kappa(data = .,
x = "breast_claims",
y = "breast_ehr"),
.id = "m") %>%
mutate(Q = kappa,
U = se ^ 2) ## # A tibble: 40 x 6
## m n kappa se Q U
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 8678 0.444 0.0123 0.444 0.000152
## 2 2 8690 0.444 0.0123 0.444 0.000152
## 3 3 8685 0.444 0.0123 0.444 0.000152
## 4 4 8674 0.444 0.0123 0.444 0.000152
## 5 5 8669 0.444 0.0123 0.444 0.000152
## 6 6 8674 0.444 0.0123 0.444 0.000152
## 7 7 8666 0.444 0.0123 0.444 0.000152
## 8 8 8667 0.444 0.0123 0.444 0.000152
## 9 9 8668 0.444 0.0123 0.444 0.000152
## 10 10 8670 0.444 0.0123 0.444 0.000152
## # … with 30 more rows
mice::pool.scalarPass the estimates and standard errors to mice::pool.scalar
## [1] 0.444143
## [1] 0.4199912 0.4682948
Per ?mice::pool.scalar, returns a list with components:
m is the number of imputations.qhat contains the m univariate estimates of repeated complete data analyses.u contains the corresponding m variances of the univariate estimates.qbar is the pooled univariate estimate, formula (3.1.2) Rubin (1987).ubar is the mean of the variances (i.e. the pooled within-imputation variance), formula (3.1.3) Rubin (1987).b is the between-imputation variance, formula (3.1.4) Rubin (1987).t is the total variance of the pooled estimated, formula (3.1.5) Rubin (1987).r is the relative increase in variance due to nonresponse, formula (3.1.7) Rubin (1987).df is the degrees of freedom for t reference distribution, formula (3.1.6) Rubin (1987) or method of Barnard-Rubin (1999) (if method = “smallsample”).Componentfmi` is the fraction missing information due to nonresponse, formula (3.1.10) Rubin (1987).combined_kappa_res <- tibble::tibble(kappa = pooled_est$qbar,
lower_ci = pooled_est$qbar + -1.96 * sqrt(pooled_est$t),
upper_ci = pooled_est$qbar + 1.96 * sqrt(pooled_est$t)) %>%
dplyr::bind_rows(.,
complete_kappa_res) %>%
mutate(scenario = c("mi", "complete"))
combined_kappa_res## # A tibble: 2 x 4
## kappa lower_ci upper_ci scenario
## <dbl> <dbl> <dbl> <chr>
## 1 0.444 0.420 0.468 mi
## 2 0.428 0.401 0.455 complete
# Plot the complete-case and pooled multiple-imputation kappa estimates
# with their intervals on a common 0-1 axis for comparison.
ggplot(data = combined_kappa_res,
aes(x = kappa,
y = scenario)) +
geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
width = 0.05) +
geom_point(size = 2) +
scale_x_continuous(limits = c(0, 1))\[\bar{\theta} = \frac{1}{m} \sum_{m=1}^{m} \hat{\theta}_{m}\]
We have all the estimates that we need to do this by hand using Rubin’s Rules
## [1] 40
## [1] 0.4438936 0.4443171 0.4439941 0.4440877 0.4440158 0.4440877 0.4439726
## [8] 0.4439870 0.4440014 0.4440302 0.4440903 0.4441021 0.4437642 0.4442169
## [15] 0.4441164 0.4446541 0.4445511 0.4440014 0.4445356 0.4440302 0.4438936
## [22] 0.4442488 0.4448959 0.4444211 0.4440733 0.4442985 0.4441595 0.4437561
## [29] 0.4439438 0.4439582 0.4446087 0.4440589 0.4440014 0.4440514 0.4436344
## [36] 0.4440080 0.4443680 0.4444067 0.4444827 0.4440014
## [1] 0.444143
This is the same as we saw above
## [1] 0.444143
\[\bar{V} = \frac{1}{m} \sum_{m=1}^{m} V_{m}\]
## [1] 0.0001517263 0.0001516603 0.0001516851 0.0001517544 0.0001517839
## [6] 0.0001517544 0.0001518016 0.0001517957 0.0001517898 0.0001517780
## [11] 0.0001518658 0.0001517485 0.0001517794 0.0001517014 0.0001517426
## [16] 0.0001517467 0.0001518922 0.0001517898 0.0001516829 0.0001517780
## [21] 0.0001517263 0.0001518007 0.0001517505 0.0001517300 0.0001517603
## [26] 0.0001516726 0.0001517250 0.0001518905 0.0001518135 0.0001518075
## [31] 0.0001518685 0.0001517662 0.0001517898 0.0001516616 0.0001518327
## [36] 0.0001518949 0.0001517470 0.0001517358 0.0001517000 0.0001517898
## [1] 0.000151768
## [1] 0.000151768
\[B = \frac{1}{m - 1} \sum_{m=1}^{m} {(\hat{\theta}_{m} - \bar{\theta})} ^ 2\]
## [1] 7.044579e-08
## [1] 7.044579e-08
\[T = \bar{V} + (1 + {m}^{-1})B\]
## [1] 0.0001518402
## [1] 0.0001518402
Trick with odds ratio is to calculate the log-odds, then pool, then exponentiate.
# Simple function to calculate the log(odds ratio) and approximate standard error
# from the 2x2 table of data[[x]] by data[[y]].
calc_ln_or <- function(data, x, y) {
tab <- table(data[[x]], data[[y]])
# Individual cell counts of the 2x2 table
n_11 <- tab[1, 1]
n_12 <- tab[1, 2]
n_21 <- tab[2, 1]
n_22 <- tab[2, 2]
# or <- (n_11 * n_22) / (n_12 * n_21)
# Flipping the odds ratio to have the reference levels match
or <- (n_12 * n_21) / (n_11 * n_22)
ln_or <- log(or)
# approximate standard error for ln(or): square root of the summed
# reciprocal cell counts (undefined if any cell is zero)
se_ln_or <- sqrt((1 / n_11) + (1 / n_12) + (1 / n_21) + (1 / n_22))
tibble::tibble(ln_or = ln_or,
se_ln_or = se_ln_or)
}## [1] 1.065003
## [1] 0.9815445 1.1555579
# Set up for pooling
# Q holds the per-imputation log(OR) estimates; U their squared SEs.
# Pooling is done on the log scale and exponentiated afterwards.
m <- imp_final$m
Q <- rep(NA, m)
U <- rep(NA, m)
for (i in 1:m) {
ln_or_res <- calc_ln_or(data = complete(imp_final, i),
x = "sex",
y = "flu_ehr")
Q[i] <- ln_or_res$ln_or
U[i] <- (ln_or_res$se_ln_or) ^ 2 # (standard error of estimate)^2
}mice::pool.scalar## $m
## [1] 40
##
## $qhat
## [1] 0.04354367 0.04660435 0.03017746 0.03842249 0.02817529 0.03226535
## [7] 0.03587783 0.02715582 0.04304835 0.04099474 0.04050883 0.04354876
## [13] 0.03433171 0.04148236 0.03944401 0.04405491 0.04051502 0.03843606
## [19] 0.03479434 0.03176402 0.03738711 0.03535999 0.04354367 0.03532888
## [25] 0.04252689 0.02759346 0.03943759 0.02362538 0.04100668 0.03382990
## [31] 0.05329689 0.04355386 0.02920490 0.03993000 0.04203866 0.03130897
## [37] 0.05789426 0.04097694 0.03583219 0.03535999
##
## $u
## [1] 0.001538653 0.001540183 0.001541970 0.001538795 0.001539600 0.001539779
## [7] 0.001537865 0.001539430 0.001537066 0.001537721 0.001535643 0.001538152
## [13] 0.001538611 0.001539817 0.001538967 0.001538740 0.001535148 0.001537792
## [19] 0.001541730 0.001539189 0.001539629 0.001538280 0.001538653 0.001540295
## [25] 0.001537979 0.001543070 0.001539471 0.001537334 0.001536723 0.001538024
## [31] 0.001533831 0.001537651 0.001539267 0.001541074 0.001535899 0.001536104
## [37] 0.001536093 0.001539226 0.001540887 0.001538280
##
## $qbar
## [1] 0.03810454
##
## $ubar
## [1] 0.001538566
##
## $b
## [1] 4.721021e-05
##
## $t
## [1] 0.001586956
##
## $df
## [1] 21.53355
##
## $r
## [1] 0.03145167
##
## $fmi
## [1] 0.1095279
Transform the estimates (in this case, exponentiate)
## [1] 1.03884
## [1] 0.9608132 1.1232029
# Complete-case logistic regression of flu vaccination in the EHR on sex;
# rows with missing sex are dropped by glm's default NA handling.
or_lr_cc <- glm((flu_ehr == "Yes") ~ sex,
family = binomial(link = "logit"),
data = data) %>%
broom::tidy(conf.int = TRUE, exponentiate = TRUE)
or_lr_cc## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.451 0.0245 -32.4 8.72e-231 0.430 0.474
## 2 sexMale 1.07 0.0416 1.51 1.30e- 1 0.981 1.16
# Fit model for each imputed data set
or_lr_mi_fit <- with(imp_final,
glm((flu_ehr == "Yes") ~ sex,
family = binomial(link = "logit")))
# Pool the results with mice
or_lr_mi <- mice::pool(or_lr_mi_fit)
# Clean up the results (pooling happens on the coefficient/log-odds scale;
# exponentiate = TRUE converts the pooled estimates to odds ratios)
or_lr_mi <- summary(or_lr_mi, conf.int = TRUE, exponentiate = TRUE)
or_lr_mi## term estimate std.error statistic df p.value 2.5 %
## 1 (Intercept) 0.4843068 0.02302561 -31.4882767 12492.251 0.000000 0.4629342
## 2 sexMale 1.0388398 0.03983661 0.9565207 9746.982 0.338833 0.9608053
## 97.5 %
## 1 0.5066662
## 2 1.1232121
Not the prettiest way to get there, but here the odds ratio results are visualized for each scenario.
# Assemble the odds-ratio estimate and 95% CI from each of the four
# scenarios, then draw them on a common log-scaled axis for comparison.
or_plot_data <- tibble::tribble(
  ~scenario, ~or, ~lower_ci, ~upper_ci,
  "Complete case, 2x2 table", exp(or_cc$ln_or), exp(or_cc$ln_or - 1.96 * or_cc$se_ln_or), exp(or_cc$ln_or + 1.96 * or_cc$se_ln_or),
  "MI, 2x2 table", exp(pooled_est$qbar), exp(pooled_est$qbar - 1.96 * sqrt(pooled_est$t)), exp(pooled_est$qbar + 1.96 * sqrt(pooled_est$t)),
  "Complete case, logistic reg", or_lr_cc[2, "estimate"][[1]], or_lr_cc[2, "conf.low"][[1]], or_lr_cc[2, "conf.high"][[1]],
  "MI, logistic reg", or_lr_mi[2, "estimate"], or_lr_mi[2, "2.5 %"], or_lr_mi[2, "97.5 %"]
)

ggplot(data = or_plot_data,
       aes(x = or,
           y = scenario)) +
  geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
                width = 0.05) +
  geom_point(size = 2) +
  scale_x_continuous(trans = "log10",
                     breaks = seq(0, 5, 1),
                     limits = c(0.1, 5)) +
  labs(x = "Odds ratio (log scale)",
       y = "scenario")